library(tidyverse)
library(knitr)
library(plotly) ; library(viridis) ; library(gridExtra) ; library(RColorBrewer) ; library(ggpubr)
library(biomaRt)
library(polycor)
library(caret) ; library(ROCR) ; library(car) ; library(MLmetrics)
library(corrplot)
library(expss) ; library(knitr) ; library(kableExtra)
library(foreach) ; library(doParallel)

# Colour palette used throughout for SFARI gene scores.
#
# r: integer vector of positions into the palette. Positions 1-6 are the
#    coloured hues and 7-10 are greys (usage below calls r = c(1:3, 8, 7)
#    for scores 1-3 plus the Neuronal/Others categories).
# Returns a character vector of colour strings, same length as r.
#
# Fix: the original assigned the result to a local `pal` and relied on the
# invisible value of the assignment; return the subset directly instead.
SFARI_colour_hue <- function(r) {
  c('#FF7631', '#FFB100', '#E8E328', '#8CC83F', '#62CCA6', '#59B9C9',
    '#b3b3b3', '#808080', 'gray', '#d9d9d9')[r]
}


Load Dataset


# Select which clustering solution to use; its module assignments are
# attached to every gene below
clustering_selected = 'DynamicHybrid'
clusterings = read_csv('./../Data/clusters.csv')
# Extract the selected clustering column as a plain unnamed vector
clusterings$Module = clusterings[,clustering_selected] %>% data.frame %>% unlist %>% unname
assigned_module = clusterings %>% dplyr::select(ID, Module)

# Dataset created with 20_04_07_create_dataset.html
dataset = read.csv(paste0('./../Data/dataset_', clustering_selected, '.csv'), row.names=1)
# mutate() drops rownames, so save them here and restore them afterwards
rownames_dataset = rownames(dataset)
dataset = dataset %>% mutate(Module = clusterings$Module, gene.score = as.character(gene.score)) %>%
                      mutate(gene.score = ifelse(gene.score=='Others', 'None', gene.score)) %>%
          dplyr::select(-matches(clustering_selected))
rownames(dataset) = rownames_dataset

# Fix any Gene Significance that is NA
# Fix any Gene Significance that is NA
GS_missing = rownames(dataset)[is.na(dataset$GS)]
if(length(GS_missing)>0){
  # Gandal dataset
  load('./../Data/preprocessed_data.RData')
  datExpr = datExpr %>% data.frame
  
  # Recompute GS as the polyserial correlation between the gene's expression
  # profile and Diagnosis — assumes this matches how GS was originally
  # computed upstream (TODO confirm)
  for(g in GS_missing){
    dataset$GS[rownames(dataset) == g] = polyserial(as.numeric(datExpr[g,]), datMeta$Diagnosis)
  }
}



# SFARI Genes
# SFARI Genes: keep one row per Ensembl ID, dropping entries without an ID
SFARI_genes = read_csv('./../../../SFARI/Data/SFARI_genes_01-03-2020_w_ensembl_IDs.csv')
SFARI_genes = SFARI_genes[!duplicated(SFARI_genes$ID) & !is.na(SFARI_genes$ID),]


# GO Neuronal annotations: regex 'neuron' in GO functional annotations and label the genes that make a match as neuronal
GO_annotations = read.csv('./../Data/genes_GO_annotations.csv')
GO_neuronal = GO_annotations %>% filter(grepl('neuron', go_term)) %>% 
              mutate('ID'=as.character(ensembl_gene_id)) %>% 
              dplyr::select(-ensembl_gene_id) %>% distinct(ID) %>%
              mutate('Neuronal'=1)


# Add gene symbol
# Add gene symbol
getinfo = c('ensembl_gene_id','external_gene_id')
# The Feb 2014 Ensembl archive is used to match the Gencode v19 annotation
mart = useMart(biomart='ENSEMBL_MART_ENSEMBL', dataset='hsapiens_gene_ensembl',
               host='feb2014.archive.ensembl.org') ## Gencode v19
gene_names = getBM(attributes=getinfo, filters=c('ensembl_gene_id'), values=rownames(dataset), mart=mart)


# NOTE(review): g and GS_missing only exist if the NA-GS fix above ran;
# rm() will emit a warning otherwise
rm(getinfo, mart, rownames_dataset, GO_annotations, g, GS_missing)


The features that will be considered for the classification model will be the ones WGCNA uses to identify significant modules and genes:



Filtering the 1830 genes that were not assigned to any cluster (represented as the gray cluster)

# Module label of the genes that were not assigned to any cluster
rm_cluster = dataset[is.na(dataset$MTcor),'Module'] %>% unique %>% as.character

# Keep only clustered genes, drop the p-value / module-label columns and the
# gray module membership, and add the model features:
#   absGS - absolute Gene Significance
#   SFARI - binary objective variable (is the gene in the SFARI dataset?)
#
# keep_rows mirrors filter()'s semantics exactly (filter drops rows whose
# condition is NA), so the rownames below align with the filtered rows.
# BUG FIX: the original indexed rownames only by Module != 'gray', which can
# misalign if MTcor is NA for a non-gray gene.
keep_rows = with(dataset, !is.na(Module) & Module != 'gray' & !is.na(MTcor))

new_dataset = dataset %>% filter(Module != 'gray' & !is.na(MTcor)) %>% 
              dplyr::select(-c(matches('pval|Module'), MMgray)) %>%
              mutate('absGS'=abs(GS), 'SFARI'=gene.score!='None') %>% dplyr::select(-gene.score)
rownames(new_dataset) = rownames(dataset)[keep_rows]

rm(rm_cluster, keep_rows)


Summary of the changes made to the original WGCNA variables:


  • Using Module Membership variables instead of binary module membership

  • Including a new variable with the absolute value of GS

  • Removing genes assigned to the gray module (unclassified genes)

  • Adding the Objective variable: Binary label indicating if it’s in the SFARI dataset or not

# Show the first 5 genes transposed, so every feature appears as a row
table_info = t(head(new_dataset, 5))

kable(table_info,
      caption = '(Transposed) features and their values for the first rows of dataset',
      col.names = colnames(table_info)) %>%
  kable_styling(full_width = F)
(Transposed) features and their values for the first rows of dataset
ENSG00000174840 ENSG00000121410 ENSG00000078328 ENSG00000175899 ENSG00000166535
MTcor 0.0832482 -0.2137853 -0.9145290 -0.3034897 0.2190318
GS 0.0946359 -0.2088418 -0.4470894 -0.2260618 0.1485608
MM.DE8C00 -0.0813327 -0.0562470 -0.0427428 0.2965456 0.0362411
MM.6FB000 -0.4548321 -0.1166111 -0.3156040 -0.3129274 -0.0840447
MM.00C086 -0.2493116 -0.2158016 -0.1905838 -0.4318493 -0.0385317
MM.F57963 -0.4332353 -0.0098177 -0.0655534 -0.0063333 -0.1432524
MM.DA8E00 0.0768426 -0.0539677 -0.0741071 -0.1747240 -0.2641453
MM.00BDD4 0.2332492 0.0207811 0.0180752 -0.2393298 -0.0028267
MM.00BF74 0.1102612 -0.0487173 -0.1954640 -0.2362997 -0.0288798
MM.00A7FF -0.0627492 -0.0140640 -0.1196216 -0.2903817 -0.2081182
MM.5A9DFF -0.1005093 0.0912025 0.0866405 -0.2145973 -0.0720524
MM.00C08E 0.1686922 -0.0506604 -0.2431874 -0.1955273 0.0081254
MM.FF64B1 0.2028247 -0.0198433 -0.0208405 -0.2608037 0.1219179
MM.D69100 -0.1755946 0.1545721 -0.0156458 0.0452504 -0.2916027
MM.BC81FF 0.1803425 0.0430306 -0.0049419 0.2805490 -0.1306227
MM.00AAFD 0.0359790 -0.0001570 -0.4124447 0.1065160 0.0361258
MM.9CA700 0.0459371 -0.1639700 -0.4440864 -0.0686088 -0.0172798
MM.00BB4B 0.3436858 -0.0195388 -0.3165312 0.0396536 0.0836749
MM.A789FF 0.1747797 0.0790658 -0.2700054 0.1450093 0.0306923
MM.8E92FF 0.0610536 -0.1444906 -0.2406208 -0.1504703 -0.0595903
MM.00BECE -0.1867569 -0.2473776 -0.4344758 -0.4140629 -0.1093526
MM.79AF00 -0.0068428 -0.1944098 -0.5512190 -0.2593433 -0.0956816
MM.00BF7D -0.0250652 -0.2851167 -0.5991594 -0.1310486 0.1450658
MM.2CB600 -0.1626872 -0.3090055 -0.5227051 -0.2438819 0.0868732
MM.8CAB00 0.0066877 -0.1345929 -0.5078961 -0.3926168 -0.0442148
MM.E68710 0.0442406 -0.2147079 -0.5732910 -0.3120124 -0.1342483
MM.00BCDA -0.1286629 -0.1446787 -0.4015763 -0.3459154 0.1543522
MM.7F96FF -0.1524808 -0.0911292 -0.3353314 -0.5333744 -0.0181422
MM.E28900 -0.2106204 0.1352302 0.1082066 0.1389685 -0.1830438
MM.00A4FF 0.0573277 0.1818729 -0.1196505 0.0848474 0.1758487
MM.D575FE -0.3219647 0.1767890 -0.1421686 0.1295538 0.0016537
MM.FF6C91 -0.4037728 0.3468478 0.3071323 0.4035206 0.0485337
MM.00C19E -0.1522857 -0.0555546 -0.1482107 -0.2722913 -0.0711533
MM.C79800 0.0309320 -0.0698142 0.1716357 -0.2161322 -0.0679629
MM.00B5EE 0.0560484 -0.0355969 -0.2069813 -0.3117382 0.0714419
MM.6E99FF -0.1884645 -0.0846878 -0.3388231 -0.2795725 0.1314537
MM.00C0B4 -0.1010749 -0.2027009 -0.0762083 -0.3378813 -0.0301776
MM.BC9D00 -0.2162542 0.0387011 0.1497072 -0.3360844 -0.0780279
MM.CD79FF -0.1836296 0.0684606 0.1001112 -0.3119923 -0.0364334
MM.F07E4B -0.0918536 -0.0007418 0.2622670 -0.0906417 0.1928493
MM.FD61D3 -0.1648623 0.1058354 0.4004618 -0.2103919 0.0696020
MM.55B300 -0.0992350 -0.0748253 0.1283496 -0.4167064 -0.0308686
MM.FF63B8 -0.2231235 -0.0097780 0.2169574 -0.3204277 -0.1182891
MM.F8766D 0.0018065 0.1787646 0.0972655 0.1169564 0.0469163
MM.ED813E 0.1747014 0.1500265 0.3751523 -0.0292345 0.1926426
MM.F863DF 0.0578081 0.2341973 0.3536940 -0.0976476 0.0539418
MM.00B7E9 0.1485387 -0.0559437 -0.0679460 -0.2627636 0.1747425
MM.94A900 0.1403746 -0.0928804 0.0697380 -0.3723450 0.0959963
MM.00ADFA 0.1639550 0.1062030 0.5221455 0.0800016 -0.1757568
MM.83AD00 0.0873033 0.2833470 0.3423678 0.1297448 -0.0954132
MM.44B500 0.3713062 -0.0869081 0.2244425 0.0569648 0.0842512
MM.DB72FB 0.4066223 0.0136957 0.1942735 -0.0255669 -0.0923165
MM.E9842C 0.4726265 -0.0762786 0.0989248 -0.1288009 0.0674142
MM.00BD61 0.5210802 0.0107588 0.0010740 0.2004524 0.0926400
MM.C29B00 0.4831548 0.1114200 0.3375801 0.1484716 0.1278595
MM.00BA3D 0.1682624 0.1315209 0.1374211 0.0476274 -0.1058637
MM.FF62BF -0.0557176 0.1265554 0.0988161 0.0112068 -0.0510165
MM.00BE6B 0.0631851 -0.0834246 -0.2719844 0.3030880 0.0016276
MM.D19300 0.1417563 -0.0884539 -0.3787281 0.2912178 -0.0484254
MM.00B92D 0.3595897 -0.0279321 -0.3241466 -0.0835873 -0.0642214
MM.AAA300 0.4640766 0.0445820 0.0295084 0.2533439 -0.0879892
MM.00BADF 0.2055705 0.1287859 0.2109354 -0.0232069 -0.1623422
MM.00C0BA 0.1124727 -0.1240675 0.1433735 0.1223364 -0.1070138
MM.00C1AD 0.0754203 -0.0627419 -0.3283389 0.2385344 0.1154950
MM.B0A100 -0.0508591 0.1454518 0.0394742 0.3385572 -0.0889242
MM.FF61C6 -0.1501804 -0.0803551 -0.1479026 0.0553914 0.0111525
MM.3EA1FF 0.2304045 0.2117181 0.0220890 0.2721439 0.1484570
MM.FA7477 0.2540248 0.2713906 0.3562823 0.3399675 0.2209892
MM.C57DFF -0.0004470 -0.0423414 0.0046082 0.4714587 0.0014977
MM.CC9600 0.2016330 0.0309888 0.0323509 0.3465395 0.1563802
MM.00C1A5 0.0184487 -0.0328614 0.1292002 -0.0627340 0.0678604
MM.FE61CD -0.0033463 -0.0425762 -0.2637903 -0.0157298 0.0716888
MM.F464E4 -0.2744923 -0.1487886 -0.1713998 -0.1130489 0.0866947
MM.B69F00 -0.0726044 -0.1572332 -0.0643350 0.1028033 0.0131138
MM.F066EA -0.0787652 -0.0752724 0.1268331 0.0168737 0.1381884
MM.FA62D9 0.0148696 0.0636646 -0.2874563 -0.1779839 -0.0072724
MM.00BFC1 0.0723657 0.0396408 0.0664050 0.2632726 0.1231384
MM.FC7180 0.1836392 -0.1895422 -0.4006710 -0.0123738 0.2717846
MM.00B8E4 0.2255110 0.0652075 0.1777184 0.2407287 0.0180128
MM.00B0F6 0.3100602 -0.1014479 0.2722409 0.0673448 -0.0765110
MM.FF66A9 0.0445709 0.0530329 0.4204220 0.2318865 -0.2417528
MM.00BC56 -0.0563847 0.2181936 0.4328106 0.1377576 -0.2324858
MM.00BFC8 -0.0481471 0.0472750 0.3811959 -0.1479967 -0.1800069
MM.E26EF7 -0.0698042 0.2161273 0.5563800 0.3859588 0.0372900
MM.00C096 -0.0875981 0.0454643 0.4424028 0.1957238 0.1606864
MM.00B813 0.0132938 0.1460011 0.6040098 0.1790479 0.0706359
MM.63B200 0.1651734 0.1694572 0.5846569 0.3462267 0.1601311
MM.A3A500 -0.1722826 0.1903747 0.5483385 0.3127235 -0.0023800
MM.00B2F2 -0.0958665 0.2050325 0.5628584 0.0988988 -0.1448892
MM.FE6E89 -0.2258332 0.1795930 0.5157362 0.1972410 -0.2019764
MM.9B8EFF 0.0000429 0.2379469 0.3543707 0.6145468 -0.0369566
MM.F37C58 -0.3405722 0.2391375 0.3777951 0.3933602 -0.0841988
MM.E76BF3 0.2590833 0.0291549 0.3382920 0.2467875 -0.2286062
MM.FF68A2 -0.2010435 0.1146447 0.3463303 0.1604674 -0.0712511
MM.FF6A9A -0.2404087 0.0764843 0.1524623 0.1295565 -0.1168925
MM.B285FF 0.0708652 0.0495138 0.3861183 0.3738267 -0.0348559
MM.EC69EE 0.0344338 0.0792035 0.1817272 0.5724684 0.0207306
absGS 0.0946359 0.2088418 0.4470894 0.2260618 0.1485608
SFARI 0.0000000 0.0000000 1.0000000 0.0000000 0.0000000
rm(table_info)
# Keep the unfiltered data around (needed later for gene.score and Module
# lookups) and make the filtered version the working dataset
original_dataset = dataset
dataset = new_dataset

rm(new_dataset)

The final dataset contains 13628 observations (genes) and 99 variables



Exploratory Analysis


PCA of Variables


The Module Membership variables are grouped by Module-Trait correlation, with positive correlations on one side, negative on the other, and both SFARI and absGS are in the middle of both groups (just like with Gandal’s, Gupta’s and Wright’s datasets)

# PCA of the variables: transpose so each feature becomes an observation
pca = dataset %>% mutate(SFARI = as.numeric(SFARI)) %>% t %>% prcomp

# Classify each variable by its name (MM.<hex>, SFARI, GS/absGS, MTcor)
plot_data = data.frame('ID'=colnames(dataset), 'PC1' = pca$x[,1], 'PC2' = pca$x[,2],
                       type=ifelse(grepl('MM', colnames(dataset)),'ModMembership',
                            ifelse(grepl('SFARI', colnames(dataset)), 'SFARI',
                            ifelse(grepl('GS', colnames(dataset)), 'GS', 'MTcor'))))


# Module-Diagnosis correlation per module, with the module hex colour renamed
# to match the MM.<hex> variable names so it can be joined on ID
mtcor_by_module = original_dataset %>% dplyr::select(Module, MTcor) %>% unique
colnames(mtcor_by_module) = c('ID','MTcor')

plot_data = mtcor_by_module %>% mutate(ID = gsub('#','MM.',ID)) %>% right_join(plot_data, by='ID')

ggplotly(plot_data %>% ggplot(aes(PC1, PC2, color=MTcor)) + geom_point(aes(id=ID)) +
         xlab(paste0('PC1 (',round(100*summary(pca)$importance[2,1],1),'%)')) +
         ylab(paste0('PC2 (',round(100*summary(pca)$importance[2,2],1),'%)')) +
         scale_colour_distiller(palette = 'RdBu', na.value = 'darkgrey') + theme_minimal() +
         ggtitle('PCA of variables coloured by Module-Diagnosis correlation'))
rm(mtcor_by_module, pca)


PCA of Samples


  • The two main patterns that seem to characterise the genes are their Gene Significance and the Module-Diagnosis correlation of their corresponding module

  • Mean Expression doesn’t seem to play an important role

  • SFARI Genes seem to be evenly distributed everywhere

  • It’s not clear what the 2nd principal component is capturing, perhaps a bit of mean level of expression?

# Mean Expression data
load('./../Data/preprocessed_data.RData')
datExpr = datExpr %>% data.frame
rownames(datExpr) = datGenes$ensembl_gene_id
datMeta = datMeta %>% mutate(ID = paste0('X',description))
mean_expr = data.frame('ID'=rownames(datExpr), 'meanExpr' = rowMeans(datExpr))

# PCA of the samples (genes): the same transposed PCA as above, but here the
# gene coordinates are read from pca$rotation
pca = dataset %>% t %>% prcomp

plot_data = data.frame('ID'=rownames(dataset), 'PC1'=pca$rotation[,1], 'PC2'=pca$rotation[,2], 
                       'SFARI'=dataset$SFARI, 'MTcor'=dataset$MTcor, 'GS'=dataset$GS) %>%
            mutate(alpha=ifelse(SFARI, 0.7, 0.2)) %>% left_join(mean_expr, by='ID')

# Four views of the same projection, coloured by different annotations
p1 = plot_data %>% ggplot(aes(PC1, PC2, color=MTcor)) + geom_point(alpha=0.4) + scale_color_viridis() + 
     theme_minimal() + ggtitle('Genes coloured by Module-Diagnosis correlation') +
     xlab(paste0('PC1 (',round(100*summary(pca)$importance[2,1]),'%)')) +
     ylab(paste0('PC2 (',round(100*summary(pca)$importance[2,2]),'%)')) +
     theme(legend.position='bottom')

p2 = plot_data %>% ggplot(aes(PC1, PC2, color=GS)) + geom_point(alpha=0.4) + scale_color_viridis() + 
     xlab(paste0('PC1 (',round(100*summary(pca)$importance[2,1]),'%)')) +
     ylab(paste0('PC2 (',round(100*summary(pca)$importance[2,2]),'%)')) +
     theme_minimal() + ggtitle('Genes coloured by Gene Significance') + theme(legend.position='bottom')

# SFARI genes drawn more opaque (alpha set above) with marginal densities
p3 = plot_data %>% ggplot(aes(PC1, PC2, color=SFARI)) + geom_point(alpha = plot_data$alpha) +
     xlab(paste0('PC1 (',round(100*summary(pca)$importance[2,1]),'%)')) +
     ylab(paste0('PC2 (',round(100*summary(pca)$importance[2,2]),'%)')) +
     theme_minimal() + ggtitle('Genes coloured by SFARI label') + theme(legend.position='bottom')
p3 = ggExtra::ggMarginal(p3, type='density', groupColour=TRUE, size=10)

p4 = plot_data %>% ggplot(aes(PC1, PC2, color=meanExpr)) + geom_point(alpha=0.4) + scale_color_viridis() + 
     xlab(paste0('PC1 (',round(100*summary(pca)$importance[2,1]),'%)')) +
     ylab(paste0('PC2 (',round(100*summary(pca)$importance[2,2]),'%)')) +
     theme_minimal() + ggtitle('Genes coloured by mean level of expression') + theme(legend.position='bottom')

grid.arrange(p1, p2, p3, p4, nrow=2)

rm(pca, datExpr, datGenes, datMeta, dds, DE_info, mean_expr, p1, p2, p3, p4)



Dividing samples into Training and Test Sets


5.02% of the observations are positive. This can be a problem when training the classification model, so the samples in the training set should be balanced between classes before the model is trained.

# Class balance of the objective variable (counts of SFARI TRUE/FALSE)
table_info = dataset %>% apply_labels(SFARI = 'SFARI')

cro(table_info$SFARI)
 #Total 
 SFARI 
   FALSE  12944
   TRUE  684
   #Total cases  13628
rm(table_info)

To divide our samples into training and test sets:

set.seed(123)

# Recover each gene's original SFARI score so the split can be stratified by
# score, not only by the binary SFARI label
sample_scores = dataset %>% mutate(ID = rownames(.)) %>% dplyr::select(ID) %>% 
                left_join(original_dataset %>% mutate(ID = rownames(.)) %>% dplyr::select(ID, gene.score), 
                          by = 'ID') %>% 
                mutate(gene.score = ifelse(is.na(gene.score), 'None', gene.score))

# 70/30 split; createDataPartition preserves the gene.score proportions
train_idx = createDataPartition(sample_scores$gene.score, p = 0.7, list = FALSE)
train_set = dataset[train_idx,]
test_set = dataset[-train_idx,]


rm(sample_scores, train_idx)


Label distribution in training set


To fix this class imbalance, we are going to use SMOTE, an over-sampling technique that over-samples the minority class (SFARI Genes) by creating synthetic examples, in the training set

cro(train_set$SFARI)
 #Total 
 train_set$SFARI 
   FALSE  9061
   TRUE  480
   #Total cases  9541


Labels distribution in test set


This set is used just to evaluate how well the model performs, so the class imbalance is not a problem here

cro(test_set$SFARI)
 #Total 
 test_set$SFARI 
   FALSE  3883
   TRUE  204
   #Total cases  4087


Logistic Regression


Train model

# https://shiring.github.io/machine_learning/2017/04/02/unbalanced
# https://topepo.github.io/caret/using-your-own-model-in-train.html#Illustration5


# caret's classProbs = TRUE requires factor levels that are valid R names,
# so the logical SFARI label is recoded as 'SFARI' / 'not_SFARI'
train_set = train_set %>% mutate(SFARI = ifelse(SFARI==TRUE, 'SFARI', 'not_SFARI') %>% as.factor)

# 10-fold cross-validation repeated 5 times, with SMOTE over-sampling
# applied inside each resample
k_fold = 10
cv_repeats = 5
smote_over_sampling = trainControl(method = 'repeatedcv', number = k_fold, repeats = cv_repeats, 
                                   verboseIter = FALSE, classProbs = TRUE, savePredictions = 'final', 
                                   summaryFunction = twoClassSummary, sampling = 'smote')

# Using ROC as metric because it doesn't depend on the threshold
fit = caret::train(SFARI ~ ., data = train_set, method = 'glm', family = 'binomial', metric = 'ROC',
                   trControl = smote_over_sampling)

# There is some perfect multicollinearity that doesn't let us do the vif analysis, so I'll remove those variables (you cannot use alias in caret::train, so I had to train the model again directly with glm)
ld.vars = attributes(alias(glm(SFARI~., data = train_set, family = 'binomial'))$Complete)$dimnames[[1]]

# Remove the linearly dependent variables variables
formula.new = as.formula( paste('SFARI ~ .', paste(ld.vars, collapse='-'), sep='-') )

# Retrain model without these variables
fit = caret::train(formula.new, data = train_set, method = 'glm', family = 'binomial', metric = 'ROC',
                   trControl = smote_over_sampling)


rm(smote_over_sampling, ld.vars, formula.new, k_fold, cv_repeats)


Performance


The model has an AUC of 0.6062

But the features are strongly correlated, which inflates the standard error of the coefficients, making them no longer interpretable, so perhaps it would be better to use another model

# VIF
plot_data = data.frame('Feature' = car::vif(fit$finalModel) %>% sort %>% names,
                       'VIF' = car::vif(fit$finalModel) %>% sort %>% unname) %>%
            mutate(outlier = VIF>10)

plot_data %>% ggplot(aes(reorder(Feature, -VIF), VIF, fill = !outlier)) + geom_bar(stat='identity') + 
              scale_y_log10() + geom_hline(yintercept = 10, color = 'gray', linetype = 'dashed') + 
              xlab('Model Features') + ggtitle('Variance Inflation Number for each Feature') + theme_minimal() +
              theme(legend.position = 'none', axis.text.x = element_text(angle = 90, hjust = 1))

rm(plot_data)

Correlation plot

# Pairwise correlations of all features (the last column, the SFARI label,
# is excluded)
corrplot.mixed(cor(train_set[,-ncol(train_set)]), lower = 'number', lower.col = 'gray', number.cex = .6, 
               tl.pos = 'l', tl.col = '#666666')


Possible solutions to Multicollinearity:


  1. Remove all variables with a VIF>10: We would lose all but two of our variables, not ideal

  2. Do Principal Component Regression: We would lose the relation between the prediction and the original features, which could be interesting to study

  3. Don’t do anything: Multicollinearity affects the coefficients and p-values of the regression, but it doesn’t affect the predictions, precision of the predictions or the goodness-of-fit statistics ref, but as with the previous option, we cannot study the coefficients of the regression

  4. Use Ridge Regression: The penalty it gives to high coefficients reduces the variance introduced by the correlation, making the coefficients interpretable again




Ridge Regression


Notes:

### DEFINE FUNCTIONS

# Stratified train/test split of the genes.
#
# p:    proportion of samples that go into the training set
# seed: RNG seed for the partition
# data / orig: the working and original datasets. They default to the global
#   `dataset` / `original_dataset` so existing calls keep working, but can
#   now be supplied explicitly (removes the hard dependency on globals).
# Returns a list with 'train_set' and 'test_set' data frames.
create_train_test_sets = function(p, seed, data = dataset, orig = original_dataset){
  
  # Get SFARI Score of all the samples so our train and test sets are balanced for each score
  sample_scores = data %>% mutate(ID = rownames(.)) %>% dplyr::select(ID) %>%
                  left_join(orig %>% mutate(ID = rownames(.)) %>% dplyr::select(ID, gene.score), 
                             by = 'ID') %>% 
                  mutate(gene.score = ifelse(is.na(gene.score), 'None', gene.score))

  set.seed(seed)
  # createDataPartition preserves the gene.score proportions in both sets
  train_idx = createDataPartition(sample_scores$gene.score, p = p, list = FALSE)
  
  train_set = data[train_idx,]
  test_set = data[-train_idx,]
  
  return(list('train_set' = train_set, 'test_set' = test_set))
}



# Train a ridge-regression classifier (glmnet with alpha = 0) with SMOTE
# over-sampling inside repeated CV, then evaluate on the held-out test set.
#
# p:    training-set proportion passed to create_train_test_sets
# seed: RNG seed (used for the split, the CV folds and SMOTE)
# Returns a list with accuracy, precision, recall, F1, AUC, the per-gene
# predictions (ID / prob / pred) and the model coefficients.
run_model = function(p, seed){
  
  # Create train and test sets
  train_test_sets = create_train_test_sets(p, seed)
  train_set = train_test_sets[['train_set']]
  test_set = train_test_sets[['test_set']]
  
  # Train Model
  # caret needs valid-R-name class labels for classProbs = TRUE
  train_set = train_set %>% mutate(SFARI = ifelse(SFARI==TRUE, 'SFARI', 'not_SFARI') %>% as.factor)
  # Lambda grid for the ridge penalty; alpha = 0 fixes pure ridge regression
  lambda_seq = 10^seq(1, -4, by = -.1)
  set.seed(seed)
  k_fold = 10
  cv_repeats = 5
  smote_over_sampling = trainControl(method = 'repeatedcv', number = k_fold, repeats = cv_repeats,
                                     verboseIter = FALSE, classProbs = TRUE, savePredictions = 'final', 
                                     summaryFunction = twoClassSummary, sampling = 'smote')
  fit = train(SFARI ~., data = train_set, method = 'glmnet', trControl = smote_over_sampling, metric = 'ROC',
              tuneGrid = expand.grid(alpha = 0, lambda = lambda_seq))
  
  # Predict labels in test set (0.5 probability threshold)
  predictions = fit %>% predict(test_set, type = 'prob')
  preds = data.frame('ID' = rownames(test_set), 'prob' = predictions$SFARI) %>% mutate(pred = prob>0.5)

  # Measure performance of the model
  # Logical labels are coerced to 0/1, so positive = '1' marks the SFARI class
  acc = mean(test_set$SFARI==preds$pred)
  prec = Precision(test_set$SFARI %>% as.numeric, preds$pred %>% as.numeric, positive = '1')
  rec = Recall(test_set$SFARI %>% as.numeric, preds$pred %>% as.numeric, positive = '1')
  F1 = F1_Score(test_set$SFARI %>% as.numeric, preds$pred %>% as.numeric, positive = '1')
  pred_ROCR = prediction(preds$prob, test_set$SFARI)
  AUC = performance(pred_ROCR, measure='auc')@y.values[[1]]
  
  # Extract coefficients from features (at the CV-selected lambda)
  coefs = coef(fit$finalModel, fit$bestTune$lambda) %>% as.vector
  
  return(list('acc' = acc, 'prec' = prec, 'rec' = rec, 'F1' = F1, 'AUC' = AUC, 'preds' = preds, 'coefs' =coefs))
}


### RUN MODEL

# Parameters
p = 0.75

n_iter = 25
seeds = 123:(123+n_iter-1)

# Store outputs
acc = c()
prec = c()
rec = c()
F1 = c()
AUC = c()
predictions = data.frame('ID' = rownames(dataset), 'SFARI' = dataset$SFARI, 'prob' = 0, 'pred' = 0, 'n' = 0)
coefs = data.frame('var' = c('Intercept', colnames(dataset[,-ncol(dataset)])), 'coef' = 0)

for(seed in seeds){
  
  # Run model
  model_output = run_model(p, seed)
  
  # Update outputs
  acc = c(acc, model_output[['acc']])
  prec = c(prec, model_output[['prec']])
  rec = c(rec, model_output[['rec']])
  F1 = c(F1, model_output[['F1']])
  AUC = c(AUC, model_output[['AUC']])
  preds = model_output[['preds']]
  coefs$coef = coefs$coef + model_output[['coefs']]
  update_preds = preds %>% dplyr::select(-ID) %>% mutate(n=1)
  predictions[predictions$ID %in% preds$ID, c('prob','pred','n')] = predictions[predictions$ID %in% 
                                                                      preds$ID, c('prob','pred','n')] +
                                                                    update_preds
}

coefs = coefs %>% mutate(coef = coef/n_iter)
predictions = predictions %>% mutate(prob = prob/n, pred_count = pred, pred = prob>0.5)


rm(p, seeds, update_preds, create_train_test_sets, run_model)


To summarise the predictions of all model iterations in a single value, each gene's probability is averaged over the iterations in which it appeared in the test set:

# Aggregated test set: genes that were in at least one test set, with their
# averaged probability plus GS and MTcor for the analysis below
test_set = predictions %>% filter(n>0) %>% 
           left_join(dataset %>% mutate(ID = rownames(.)) %>% dplyr::select(ID, GS, MTcor), by = 'ID')
rownames(test_set) = predictions$ID[predictions$n>0]


Performance metrics


Confusion matrix

# Confusion matrix of the aggregated (averaged, thresholded at 0.5) labels
conf_mat = test_set %>% apply_labels(SFARI = 'Actual Labels', 
                                     prob = 'Assigned Probability', 
                                     pred = 'Label Prediction')

cro(conf_mat$SFARI, list(conf_mat$pred, total()))
 Label Prediction     #Total 
 FALSE   TRUE   
 Actual Labels 
   FALSE  10230 2702   12932
   TRUE  443 241   684
   #Total cases  10673 2943   13616
rm(conf_mat)


Accuracy: Mean = 0.759 SD = 0.0133


Precision: Mean = 0.0803 SD = 0.0072


Recall: Mean = 0.3633 SD = 0.0358


F1 score: Mean = 0.1315 SD = 0.0115


ROC Curve: Mean = 0.6167 SD = 0.0206

# ROC curve from the averaged probabilities; the title reports the mean AUC
# across iterations (mean(AUC)), not the AUC of the averaged predictions
pred_ROCR = prediction(test_set$prob, test_set$SFARI)

roc_ROCR = performance(pred_ROCR, measure='tpr', x.measure='fpr')
auc = performance(pred_ROCR, measure='auc')@y.values[[1]]

plot(roc_ROCR, main=paste0('ROC curve (AUC=',round(mean(AUC),2),')'), col='#009999')
abline(a=0, b=1, col='#666666')


Lift Curve

# Lift vs rate of positive predictions, from the same prediction object
lift_ROCR = performance(pred_ROCR, measure='lift', x.measure='rpp')
plot(lift_ROCR, main='Lift curve', col='#86b300')

rm(pred_ROCR, roc_ROCR, AUC, lift_ROCR)




Coefficients


MTcor has a very small coefficient

# Per-gene module (hex colour without '#'), MTcor and SFARI label, used to
# map each MM.<hex> coefficient back to its module
gene_corr_info = dataset %>% mutate('ID' = rownames(dataset)) %>% dplyr::select(ID, MTcor, SFARI) %>% 
                 left_join(assigned_module, by ='ID') %>% mutate(Module = gsub('#','',Module))

# SFARI_perc = fraction of each module's genes that are SFARI genes
# NOTE(review): 'MM.' is a regex ('.' matches any character); fine here since
# the MM features all start with the literal prefix 'MM.'
coef_info = coefs %>% mutate('feature' = gsub('MM.','',var)) %>% 
            left_join(gene_corr_info, by = c('feature' = 'Module')) %>% 
            dplyr::select(feature, coef, MTcor, SFARI) %>% group_by(feature, coef, MTcor) %>% 
            summarise('SFARI_perc' = mean(SFARI)) %>% arrange(desc(coef))

# Show only the non-MM coefficients
coef_info %>% dplyr::select(feature, coef) %>% filter(feature %in% c('Intercept','GS','absGS','MTcor')) %>%
              dplyr::rename('Feature' = feature, 'Coefficient' = coef) %>% 
              kable(align = 'cc', 
                    caption = 'Regression Coefficients (filtering MM features)') %>% 
              kable_styling(full_width = F)
Regression Coefficients (filtering MM features)
Feature Coefficient
absGS 0.1383954
GS 0.0794006
MTcor -0.0209554
Intercept -0.4461271


There is a positive relation between the coefficient assigned to the membership of each module and the enrichment (using ORA) in SFARI genes that are assigned to that module

load('./../Data/ORA.RData')

# SFARI enrichment per module: 1 - (ORA p-value of the 'SFARI' gene set), so
# higher values mean stronger enrichment; modules without a SFARI entry get 0.
# Fix: build the data frame in one pass (map_dfr) instead of growing it
# row-by-row with add_row inside a loop.
enrichment_SFARI_info = map_dfr(names(enrichment_SFARI), function(m){
  m_info = enrichment_SFARI[[m]]
  enrichment = 1-ifelse('SFARI' %in% m_info$ID, m_info$pvalue[m_info$ID=='SFARI'], 1)
  data.frame('Module' = gsub('#','',m), 'SFARI_enrichment' = enrichment)
})

plot_data = coef_info %>% dplyr::rename('Module' = feature) %>% 
            left_join(enrichment_SFARI_info, by = 'Module') %>% filter(!is.na(MTcor))

# Coefficient vs SFARI enrichment, one point per module in its own colour
ggplotly(plot_data %>% ggplot(aes(coef, SFARI_enrichment)) + 
         geom_smooth(method = 'lm', color = 'gray', alpha = 0.1) + 
         geom_point(aes(id = Module), color = paste0('#',plot_data$Module), alpha=0.7) + 
         theme_minimal() + xlab('Coefficient') + 
         ylab('SFARI Genes Enrichment'))
# Drop the other enrichment objects loaded from ORA.RData (the loop
# temporaries m / m_info / enrichment no longer exist at global scope)
rm(enrichment_old_SFARI, enrichment_DGN, enrichment_DO, enrichment_GO, enrichment_KEGG, enrichment_Reactome)


There doesn’t seem to be a relation between the coefficient and the correlation of the module and the diagnosis.

This is not a surprise since we knew that there was a negative relation between SFARI genes and Module-Diagnosis correlation from Preprocessing/Gandal/AllRegions/RMarkdowns/20_04_03_WGCNA_modules_EA.html. The fact that there is no relation between coefficient and Module-Diagnosis correlation could even be a good sign that the model is picking some biological signal as well as the SFARI patterns (since the relation with the biological signals is positive)

# Coefficient vs Module-Diagnosis correlation; each point coloured with its
# module's hex colour (the feature names are module colours without '#').
# NOTE(review): the colour vector is built from coef_info directly and relies
# on filter(!is.na(MTcor)) preserving row order — confirm
ggplotly(coef_info %>% dplyr::rename('Module' = feature) %>% filter(!is.na(MTcor)) %>%
              ggplot(aes(coef, MTcor)) +  geom_smooth(method = 'lm', color = 'gray', alpha = 0.1) + 
              geom_point(aes(id = Module), color = paste0('#',coef_info$feature[!is.na(coef_info$MTcor)]), 
                         alpha = 0.7) + 
              theme_minimal() + xlab('Coefficient') + 
              ylab('Module-Diagnosis correlation'))




Analyse Results


Score distribution by SFARI Label


SFARI genes have a higher score distribution than the rest, but the overlap is large

plot_data = test_set %>% dplyr::select(prob, SFARI)

# Score densities by SFARI label; the dashed vertical lines mark each group's
# mean score (colours are hard-coded — presumably to match ggplot's default
# group colours; confirm)
ggplotly(plot_data %>% ggplot(aes(prob, fill=SFARI, color=SFARI)) + geom_density(alpha=0.3) + xlab('Score') +
         geom_vline(xintercept = mean(plot_data$prob[plot_data$SFARI]), color = '#00C0C2', linetype = 'dashed') +
         geom_vline(xintercept = mean(plot_data$prob[!plot_data$SFARI]), color = '#FF7371', linetype = 'dashed') +
         theme_minimal() + ggtitle('Model score distribution by SFARI Label'))


Score distribution by SFARI Score


Even though we didn’t use the actual SFARI Scores to train the model, but instead we grouped them all together, there seems to be a statistically significant positive relation between the SFARI scores and the probability assigned by the model

# Recover the original SFARI scores (1/2/3) for the test genes; non-SFARI
# genes are split into 'Neuronal' (GO neuronal annotation) and 'Others'
plot_data = test_set %>% mutate(ID=rownames(test_set)) %>% dplyr::select(ID, prob) %>%
            left_join(original_dataset %>% mutate(ID=rownames(original_dataset)), by='ID') %>%
            mutate(gene.score = ifelse(gene.score=='None', ifelse(ID %in% GO_neuronal$ID, 'Neuronal', 'Others'), 
                                       gene.score)) %>%
            dplyr::select(ID, prob, gene.score) %>% apply_labels(gene.score='SFARI Gene score')

cro(plot_data$gene.score)
 #Total 
 SFARI Gene score 
   1  128
   2  180
   3  376
   Neuronal  840
   Others  12092
   #Total cases  13616
# NOTE(review): mean_vals is computed but never used in the plot below (it is
# only rm'd afterwards)
mean_vals = plot_data %>% group_by(gene.score) %>% summarise(mean_prob = mean(prob))

# Pairwise group comparisons for the significance brackets, with the bracket
# heights (label.y) spread out so they do not overlap
comparisons = list(c('1','2'), c('2','3'), c('3','Neuronal'), c('Neuronal','Others'),
                   c('1','3'), c('3','Others'), c('2','Neuronal'),
                   c('1','Neuronal'), c('2','Others'), c('1','Others'))
increase = 0.08
base = 0.9
pos_y_comparisons = c(rep(base, 4), rep(base + increase, 2), base + 2:5*increase)

# Boxplots of model score by SFARI score with Welch t-test significance labels
plot_data %>% ggplot(aes(gene.score, prob, fill=gene.score)) + 
              geom_boxplot(outlier.colour='#cccccc', outlier.shape='o', outlier.size=3) +
              stat_compare_means(comparisons = comparisons, label = 'p.signif', method = 't.test', 
                                 method.args = list(var.equal = FALSE), label.y = pos_y_comparisons, 
                                 tip.length = .02) +
              scale_fill_manual(values=SFARI_colour_hue(r=c(1:3,8,7))) + 
              ggtitle('Distribution of probabilities by SFARI score') +
              xlab('SFARI score') + ylab('Probability') + theme_minimal() + theme(legend.position = 'none')

rm(mean_vals, increase, base, pos_y_comparisons)


Genes with the highest Probabilities


  • Considering the class imbalance in the test set (about 1:19), SFARI genes are over-represented among these top-probability genes (about 1:10)

  • High concentration of genes with a SFARI Score of 1

# Top 50 genes by averaged model probability, annotated with gene symbol,
# GS, module information and SFARI score (or Neuronal/Others)
test_set %>% dplyr::select(prob, SFARI) %>% mutate(ID = rownames(test_set)) %>% 
             arrange(desc(prob)) %>% top_n(50, wt=prob) %>%
             left_join(original_dataset %>% mutate(ID=rownames(original_dataset)), by='ID')  %>%
             mutate(gene.score = ifelse(gene.score=='None', ifelse(ID %in% GO_neuronal$ID, 'Neuronal', 'Others'), 
                                        gene.score)) %>%
             left_join(gene_names, by = c('ID'='ensembl_gene_id')) %>%
             dplyr::rename('GeneSymbol' = external_gene_id, 'Probability' = prob, 'ModuleDiagnosis_corr' =MTcor,
                           'GeneSignificance' = GS) %>%
             mutate(ModuleDiagnosis_corr = round(ModuleDiagnosis_corr, 4), Probability = round(Probability, 4), 
                    GeneSignificance = round(GeneSignificance, 4)) %>%
             dplyr::select(GeneSymbol, GeneSignificance, ModuleDiagnosis_corr, Module, Probability,
                           gene.score) %>%
             kable(caption = 'Genes with highest model probabilities from the test set') %>% 
             kable_styling(full_width = F)
Genes with highest model probabilities from the test set
GeneSymbol GeneSignificance ModuleDiagnosis_corr Module Probability gene.score
ANAPC1 0.0710 0.2018 #3EA1FF 0.7958 Others
SP4 -0.0995 0.2018 #3EA1FF 0.7919 Others
ZNF649 0.1808 0.6113 #A789FF 0.7905 Others
BRPF3 -0.2706 0.2208 #00B92D 0.7901 Others
SPIN4 0.4077 0.6113 #A789FF 0.7726 Others
ROBO1 -0.0273 0.6113 #A789FF 0.7719 Neuronal
SGCB 0.4277 0.6113 #A789FF 0.7647 Others
MYO10 -0.1091 0.1508 #00A4FF 0.7573 Neuronal
NAGS 0.1168 -0.2138 #FF6C91 0.7546 Others
PCDHB10 0.2804 0.6113 #A789FF 0.7453 Others
RIF1 0.1954 0.6113 #A789FF 0.7426 Others
GABRG1 0.1293 0.1508 #00A4FF 0.7419 Others
USP25 0.2559 0.6113 #A789FF 0.7418 Others
PDGFRA 0.4996 0.6113 #A789FF 0.7402 Others
KCTD3 0.1536 0.0832 #00BD61 0.7391 Others
SPOPL -0.1516 -0.2138 #FF6C91 0.7380 Others
NLGN1 0.3103 0.6113 #A789FF 0.7375 2
MED1 -0.1427 -0.2138 #FF6C91 0.7358 Neuronal
RIN2 0.2519 0.1508 #00A4FF 0.7356 Others
C3orf58 0.2204 0.6113 #A789FF 0.7346 3
SPAST 0.0746 -0.2138 #FF6C91 0.7309 1
SOGA1 0.1969 0.6143 #00AAFD 0.7307 Others
NETO2 0.1194 0.2018 #3EA1FF 0.7302 Others
ELMO2 0.1032 0.6113 #A789FF 0.7287 Others
CWC22 -0.0095 0.2073 #D575FE 0.7279 Others
HIP1 0.2566 0.2407 #FF61C6 0.7270 Others
BACH2 0.0527 0.2208 #00B92D 0.7269 Others
MKL2 0.0595 0.6113 #A789FF 0.7268 3
YES1 0.0980 0.1508 #00A4FF 0.7267 Others
NUDCD1 0.4902 0.6113 #A789FF 0.7253 Others
SASS6 -0.0247 -0.2138 #FF6C91 0.7240 Others
RNF144A 0.0537 0.2208 #00B92D 0.7238 Others
LIN7C 0.1267 -0.2138 #FF6C91 0.7228 Neuronal
SQLE 0.1144 0.2190 #FC7180 0.7209 Others
CDK19 0.3077 0.2018 #3EA1FF 0.7205 Others
FLRT1 -0.1712 0.0400 #B0A100 0.7204 Others
NUP214 -0.2512 0.0400 #B0A100 0.7198 Others
BCL9L -0.1023 0.4586 #D19300 0.7195 Others
NRXN3 -0.1909 -0.1972 #CC9600 0.7193 1
ZNF772 0.3303 0.1508 #00A4FF 0.7190 Others
TJP1 0.1871 0.2208 #00B92D 0.7180 Others
MBLAC2 0.1260 0.2018 #3EA1FF 0.7177 Others
SIRT1 -0.0796 0.2079 #00BDD4 0.7162 Others
SSH2 0.2467 0.4406 #00BB4B 0.7155 Others
ZNF235 0.1638 0.2208 #00B92D 0.7141 Others
HOMER2 -0.1697 -0.3275 #FA7477 0.7138 Neuronal
MAPK1IP1L -0.1771 0.0832 #00BD61 0.7136 Others
FAM135A 0.0284 0.2018 #3EA1FF 0.7126 Others
CWF19L2 -0.0711 0.2079 #00BDD4 0.7102 Others
OXSR1 -0.2093 -0.3275 #FA7477 0.7075 Others





Negative samples distribution


The objective of this model is to identify candidate SFARI genes. For this, we are going to focus on the negative samples (the non-SFARI genes)

# The candidate-gene analysis focuses on the negative samples (non-SFARI genes)
negative_set = test_set %>% filter(!SFARI)

# Attach human-readable labels for the expss cross-tabulation below
negative_set_table = apply_labels(negative_set,
                                  prob = 'Assigned Probability',
                                  pred = 'Label Prediction')

# Count how many negative genes the model labels TRUE vs FALSE
cro(negative_set_table$pred)
 #Total 
 Label Prediction 
   FALSE  10230
   TRUE  2702
   #Total cases  12932

2702 genes are predicted as ASD-related


# Density of the model probabilities over the negative (non-SFARI) test genes,
# with the 0.5 decision threshold marked as a dotted line
negative_set %>%
  ggplot(aes(prob)) +
  geom_density(color='#F8766D', fill='#F8766D', alpha=0.5) +
  geom_vline(xintercept=0.5, color='#333333', linetype='dotted') +
  labs(x = 'Probability',
       title = 'Probability distribution of the Negative samples in the Test Set') +
  theme_minimal()




Probability and Gene Significance


  • There’s a lot of noise, but the probability the model assigns to each gene seems to have a positive relation with the Gene Significance (under-expressed genes having on average the lower probabilities and over-expressed genes the highest) (this pattern was the opposite in Gandal’s dataset but the same as Gupta’s)
# Probability vs Gene Significance for the negative set, points coloured by
# the Module-Diagnosis correlation of each gene's module (layer order kept:
# smoothed trend below the zero reference line)
negative_set %>%
  ggplot(aes(prob, GS, color = MTcor)) +
  geom_point() +
  geom_smooth(method = 'loess', color = '#666666') +
  geom_hline(yintercept = 0, color='gray', linetype='dashed') +
  scale_color_gradientn(colours=c('#F8766D','white','#00BFC4')) +
  labs(x = 'Probability', y = 'Gene Significance',
       title = 'Relation between Probability and Gene Significance') +
  theme_minimal()




Probability and Module-Diagnosis correlation


  • There’s not a strong relation between the Module-Diagnosis correlation of the genes’ assigned modules and the probability assigned by the model

  • The model seems to assign slightly higher probabilities to genes belonging to modules with positive Module-Diagnosis correlations than to modules with negative Module-Diagnosis correlation

# Probability vs Module-Diagnosis correlation, coloured by Gene Significance;
# the dashed line marks the mean probability over the negative set
negative_set %>%
  ggplot(aes(MTcor, prob, color=GS)) +
  geom_point() +
  geom_smooth(method='loess', color='#666666') +
  geom_hline(yintercept=mean(negative_set$prob), color='gray', linetype='dashed') +
  scale_color_gradientn(colours=c('#F8766D','#F8766D','white','#00BFC4','#00BFC4')) +
  labs(x = 'Modules ordered by their correlation to ASD',
       y = 'Model probability') +
  theme_minimal()




Summarised version, plotting by module instead of by gene


The difference in the trend lines between this plot and the one above is that the one above takes all the points into consideration while this considers each module as an observation by itself, so the top one is strongly affected by big modules and the bottom one treats all modules the same

Again, the model seems to give lower probabilities to genes belonging to modules with a high negative correlation to Diagnosis than to the rest

# One observation per module: mean/sd of the assigned probabilities and module
# size, joined back to original_dataset to recover the module label. The first
# column (the module's hex colour) is renamed to ID for the plotly tooltip.
plot_data = negative_set %>%
            group_by(MTcor) %>%
            summarise(mean = mean(prob), sd = sd(prob), n = n()) %>%
            mutate(MTcor_sign = ifelse(MTcor>0, 'Positive', 'Negative')) %>%
            left_join(original_dataset, by='MTcor') %>%
            dplyr::select(Module, MTcor, MTcor_sign, mean, sd, n) %>%
            distinct() %>%
            dplyr::rename(ID = Module)

# Interactive scatter: loess trend in gray, linear trend per correlation sign
ggplotly(plot_data %>%
         ggplot(aes(MTcor, mean, size=n, color=MTcor_sign)) +
         geom_point(aes(id=ID), alpha=0.7) +
         geom_smooth(method='loess', color='gray', se=FALSE) +
         geom_smooth(method='lm', se=FALSE) +
         xlab('Module-Diagnosis correlation') +
         ylab('Mean Probability by Module') +
         theme_minimal())




Probability and level of expression


# Load the preprocessed expression data for the probability-vs-expression
# analysis below.
# NOTE(review): labelled 'Gupta dataset' but the path is the same file loaded
# earlier as the Gandal dataset — confirm which cohort this .RData contains.
# Gupta dataset
load('./../Data/preprocessed_data.RData')
datExpr = datExpr %>% data.frame
# Use Ensembl gene IDs as rownames so later joins by 'ID' can match
rownames(datExpr) = datGenes$ensembl_gene_id
DE_info = DE_info %>% data.frame

The relation between mean level of expression of the genes and the probability assigned by the model seems to be negative!

# Per-gene mean and standard deviation of expression
expr_stats = data.frame(ID = rownames(datExpr),
                        meanExpr = rowMeans(datExpr),
                        sdExpr = apply(datExpr, 1, sd))

# Attach expression statistics and module assignment to the negative set
plot_data = negative_set %>%
            left_join(expr_stats, by='ID') %>%
            left_join(original_dataset %>%
                        mutate(ID=rownames(original_dataset)) %>%
                        dplyr::select(ID, Module),
                      by='ID')
# The join may suffix a duplicated column name; force the last (joined) column
# back to 'Module', which downstream code expects
colnames(plot_data)[ncol(plot_data)] = 'Module'

# Mean expression vs probability, with loess (gray) and linear (dashed) trends
plot_data %>%
  ggplot(aes(meanExpr, prob)) +
  geom_point(alpha=0.2, color='#0099cc') +
  geom_smooth(method='loess', color='gray', alpha=0.3) +
  geom_smooth(method='lm', color='#808080', se=FALSE, linetype = 'dashed') +
  labs(x = 'Mean Expression', y = 'Probability',
       title = 'Mean expression vs model probability by gene') +
  theme_minimal()

rm(expr_stats)

Grouping the genes by module we see the same negative correlation!

# Collapse to one point per module: average expression vs average probability
module_means = plot_data %>%
               group_by(Module) %>%
               summarise(meanExpr = mean(meanExpr), meanProb = mean(prob), n = n())

# Module names are hex colour strings, so they are used directly as point colours
ggplotly(module_means %>%
         ggplot(aes(meanExpr, meanProb, size=n)) +
         geom_point(color=module_means$Module, alpha = 0.6) +
         geom_smooth(method='loess', se=TRUE, color='gray', alpha=0.1, size=0.7) +
         geom_smooth(method='lm', color='#808080', se=FALSE, linetype = 'dashed') +
         xlab('Mean Level of Expression') +
         ylab('Average Model Probability') +
         theme_minimal() + theme(legend.position='none') +
         ggtitle('Mean expression vs model probability by Module'))
rm(module_means)




Probability and LFC


I’m not sure if there is a relation between LFC and probability; perhaps there is for genes with a positive LFC, but for genes with a negative LFC, the relation seems like it could be negative

# Join differential-expression results to the negative set and standardise the
# column names (limma's logFC / adj.P.Val -> log2FoldChange / padj)
plot_data = negative_set %>% left_join(DE_info %>% mutate(ID=datGenes$ensembl_gene_id), by='ID') %>%
            mutate(log2FoldChange = logFC, padj = adj.P.Val)

# LFC vs probability (fix: xlab('LFC') was specified twice; once is enough)
plot_data %>% ggplot(aes(log2FoldChange, prob)) + geom_point(alpha=0.1, color='#0099cc') + 
              geom_smooth(method='loess', color='gray', alpha=0.3) +
              xlab('LFC') + ylab('Probability') +
              theme_minimal() + ggtitle('LFC vs model probability by gene')

Differentially expressed genes have a lower probability than non-differentially expressed genes

# Side-by-side panels: genes with negative LFC (left) and non-negative LFC
# (right), coloured by differential-expression status (adjusted p < 0.05).
# Shared y-limits so the two panels are directly comparable.
prob_limits = c(min(plot_data$prob), max(plot_data$prob))

p1 = plot_data %>% filter(log2FoldChange<0) %>% mutate(DE = padj<0.05) %>%
     ggplot(aes(log2FoldChange, prob, color=DE)) + geom_point(alpha=0.2) + 
     geom_smooth(method='loess', alpha=0.1) + xlab('') + ylab('Probability') + 
     ylim(prob_limits) + 
     theme_minimal() + theme(legend.position = 'none', plot.margin=unit(c(1,-0.3,1,1), 'cm'))

# Fix: removed the redundant ylab('Probability') that was immediately
# overridden by ylab('') in the original
p2 = plot_data %>% filter(log2FoldChange>=0) %>% mutate(DE = padj<0.05) %>% 
     ggplot(aes(log2FoldChange, prob, color=DE)) + geom_point(alpha=0.2) + 
     geom_smooth(method='loess', alpha=0.1) + xlab('') + ylab('') +
     scale_y_continuous(position = 'right', limits = prob_limits) +
     theme_minimal() + theme(plot.margin = unit(c(1,1,1,-0.3), 'cm'), axis.ticks.y = element_blank())

grid.arrange(p1, p2, nrow=1, top = 'LFC vs model probability by gene', bottom = 'LFC')

rm(p1, p2, prob_limits)



Conclusion


The results from this model are very different from the ones obtained in the other three datasets:

  • There is a bias in probability assigned by the model and level of expression, but it is a negative one: the higher the level of expression of a gene, the lower the probability it is assigned by the model

  • There doesn’t seem to be a noticeable relation between LFC and probability assigned by the model

Still, the model seems to be able to identify SFARI Genes, as we can see from the lift curve


Saving results

# Attach gene symbols to the test-set predictions and persist everything
# needed to reuse the fitted model
predictions = left_join(test_set, gene_names, by = c('ID' = 'ensembl_gene_id'))

save(predictions, dataset, fit, file='./../Data/Ridge_model.RData')




Session info

sessionInfo()
## R version 3.6.3 (2020-02-29)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 18.04.4 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.7.1
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.7.1
## 
## locale:
##  [1] LC_CTYPE=en_GB.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_GB.UTF-8        LC_COLLATE=en_GB.UTF-8    
##  [5] LC_MONETARY=en_GB.UTF-8    LC_MESSAGES=en_GB.UTF-8   
##  [7] LC_PAPER=en_GB.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] grid      parallel  stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] DMwR_0.4.1         doParallel_1.0.15  iterators_1.0.12   foreach_1.5.0     
##  [5] kableExtra_1.1.0   expss_0.10.2       corrplot_0.84      MLmetrics_1.1.1   
##  [9] car_3.0-7          carData_3.0-3      ROCR_1.0-7         gplots_3.0.3      
## [13] caret_6.0-86       lattice_0.20-41    polycor_0.7-10     biomaRt_2.40.5    
## [17] ggpubr_0.2.5       magrittr_1.5       RColorBrewer_1.1-2 gridExtra_2.3     
## [21] viridis_0.5.1      viridisLite_0.3.0  plotly_4.9.2       knitr_1.28        
## [25] forcats_0.5.0      stringr_1.4.0      dplyr_1.0.0        purrr_0.3.4       
## [29] readr_1.3.1        tidyr_1.1.0        tibble_3.0.1       ggplot2_3.3.2     
## [33] tidyverse_1.3.0   
## 
## loaded via a namespace (and not attached):
##   [1] readxl_1.3.1         backports_1.1.8      plyr_1.8.6          
##   [4] lazyeval_0.2.2       splines_3.6.3        crosstalk_1.1.0.1   
##   [7] digest_0.6.25        htmltools_0.4.0      gdata_2.18.0        
##  [10] fansi_0.4.1          checkmate_2.0.0      memoise_1.1.0       
##  [13] openxlsx_4.1.4       recipes_0.1.10       modelr_0.1.6        
##  [16] gower_0.2.1          matrixStats_0.56.0   xts_0.12-0          
##  [19] prettyunits_1.1.1    colorspace_1.4-1     blob_1.2.1          
##  [22] rvest_0.3.5          haven_2.2.0          xfun_0.12           
##  [25] crayon_1.3.4         RCurl_1.98-1.2       jsonlite_1.7.0      
##  [28] zoo_1.8-8            survival_3.1-12      glue_1.4.1          
##  [31] gtable_0.3.0         ipred_0.9-9          webshot_0.5.2       
##  [34] shape_1.4.4          quantmod_0.4.17      BiocGenerics_0.30.0 
##  [37] abind_1.4-5          scales_1.1.1         DBI_1.1.0           
##  [40] miniUI_0.1.1.1       Rcpp_1.0.4.6         xtable_1.8-4        
##  [43] progress_1.2.2       htmlTable_1.13.3     foreign_0.8-76      
##  [46] bit_1.1-15.2         stats4_3.6.3         lava_1.6.7          
##  [49] prodlim_2019.11.13   glmnet_3.0-2         htmlwidgets_1.5.1   
##  [52] httr_1.4.1           ellipsis_0.3.1       pkgconfig_2.0.3     
##  [55] XML_3.99-0.3         farver_2.0.3         nnet_7.3-14         
##  [58] dbplyr_1.4.2         later_1.0.0          tidyselect_1.1.0    
##  [61] labeling_0.3         rlang_0.4.6          reshape2_1.4.4      
##  [64] AnnotationDbi_1.46.1 munsell_0.5.0        cellranger_1.1.0    
##  [67] tools_3.6.3          cli_2.0.2            generics_0.0.2      
##  [70] RSQLite_2.2.0        broom_0.5.5          fastmap_1.0.1       
##  [73] evaluate_0.14        yaml_2.2.1           ModelMetrics_1.2.2.2
##  [76] bit64_0.9-7          fs_1.4.0             zip_2.0.4           
##  [79] caTools_1.18.0       nlme_3.1-147         mime_0.9            
##  [82] ggExtra_0.9          xml2_1.2.5           compiler_3.6.3      
##  [85] rstudioapi_0.11      curl_4.3             ggsignif_0.6.0      
##  [88] reprex_0.3.0         stringi_1.4.6        highr_0.8           
##  [91] Matrix_1.2-18        vctrs_0.3.1          pillar_1.4.4        
##  [94] lifecycle_0.2.0      data.table_1.12.8    bitops_1.0-6        
##  [97] httpuv_1.5.2         R6_2.4.1             promises_1.1.0      
## [100] KernSmooth_2.23-17   rio_0.5.16           IRanges_2.18.3      
## [103] codetools_0.2-16     MASS_7.3-51.6        gtools_3.8.2        
## [106] assertthat_0.2.1     withr_2.2.0          S4Vectors_0.22.1    
## [109] mgcv_1.8-31          hms_0.5.3            rpart_4.1-15        
## [112] timeDate_3043.102    class_7.3-17         rmarkdown_2.1       
## [115] TTR_0.23-6           pROC_1.16.2          shiny_1.4.0.2       
## [118] Biobase_2.44.0       lubridate_1.7.4